import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
import sklearn.metrics
from sklearn.metrics import confusion_matrix, roc_auc_score
# Load the Titanic dataset.
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider making it configurable.
df = pd.read_csv(r'C:\Users\gebra\Desktop\DS\Dataset\Titanic\titanic.csv')
# The bare expressions below only display output in an interactive
# session / notebook; in a plain script they are no-ops.
df.head()
df.info()
df.shape
# Full automated EDA report (pandas_profiling is now ydata-profiling).
pandas_profiling.ProfileReport(df)
df.describe()
# Inspect categorical columns, including missing values (dropna=False).
df.Sex.value_counts(dropna=False)
df.Ticket.value_counts(dropna=False)
df.Cabin.value_counts(dropna=False)
df.Embarked.value_counts(dropna=False)
# Extract the honorific title ("Mr", "Mrs", "Miss", ...) from the Name
# column: take the part after the comma, then the part before the period.
nome = df["Name"].str.split(",").str.get(1)
df['Nome'] = nome.str.split(".").str.get(0)
df['Nome'].value_counts()
# Drop columns that are identifiers or too sparse to be useful features.
df = df.drop(columns=[
    "PassengerId",
    "Ticket",
    "Name",
    "Cabin"])
df.head()
# One-hot encode the remaining categorical columns (Sex, Embarked, Nome).
df = pd.get_dummies(df)
df.columns
df.head()
df.info()
# The Sex_female column already indicates whether the passenger is male or
# female, so the redundant Sex_male column is removed.
df = df.drop(columns="Sex_male")
# NOTE(review): the original called pd.get_dummies(df, drop_first=True) a
# second time here; after the first call no categorical columns remain, so
# that call was a no-op and has been removed.
df.columns
# Target is Survived; everything else is a feature.
y = df.Survived
X = df.drop(columns="Survived")
# Hold out 30% of the rows for testing; random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)
# Numeric feature columns, kept for reference (not used below).
# NOTE(review): the original lines carried pasted REPL continuation
# prompts ("... "), which made this list a syntax error; they are removed.
num_cols = [
    "Pclass",
    "SibSp",
    "Parch",
    "Fare",
    "Age",
    "Sex_female",
]
X_train
X_train.isnull().sum()
X_test.isnull().sum()
# Fill missing values with medians computed on the training set only, so
# no information leaks from the test set into preprocessing.
meds = X_train.median()
X_train = X_train.fillna(meds)
X_test = X_test.fillna(meds)
X_train.isnull().sum()
X_test.isnull().sum()
# Standardize features: fit the scaler on the training data, then apply
# the same transform to the test data.
sca = preprocessing.StandardScaler()
X_train = sca.fit_transform(X_train)
X_test = sca.transform(X_test)
X_train
X_test
# Candidate classifiers and their display names (kept as two parallel
# module-level lists, matching the original interface).
modelos = [DummyClassifier,
           LogisticRegression,
           DecisionTreeClassifier,
           KNeighborsClassifier,
           GaussianNB,
           RandomForestClassifier,
           SVC,
           ]
modelos2 = ["DummyClassifier",
            "LogisticRegression",
            "DecisionTreeClassifier",
            "KNeighborsClassifier",
            "GaussianNB",
            "RandomForestClassifier",
            "SVC"]
# Fit every candidate model once and keep the fitted instances.
# NOTE(review): the original ran two separate loops, training each model
# twice — once for the accuracy report and once for the confusion matrix.
fitted = []
for modelo in modelos:
    clf = modelo()
    clf.fit(X_train, y_train)
    fitted.append(clf)
# Report test-set accuracy for each model.
for nome_modelo, clf in zip(modelos2, fitted):
    resultado = clf.score(X_test, y_test)
    print(nome_modelo, ":", resultado)
# Report the confusion matrix for each model.
for nome_modelo, clf in zip(modelos2, fitted):
    prev = clf.predict(X_test)
    print(nome_modelo, ":", "\n", confusion_matrix(y_test, prev), "\n")
# The model with the second-best accuracy (LogisticRegression) is used
# here, because SVC raised an error when trained through the yellowbrick
# library. (Translated from the original Portuguese note, which was left
# as a bare prose line and made the script a syntax error.)
# NOTE(review): the redundant mid-file re-imports of LogisticRegression,
# train_test_split and ROCAUC, and the unused ``load_spam`` import, were
# removed — everything needed is already imported at the top of the file.
# Instantiate the visualizer with the classification model
model = LogisticRegression()
visualizer = ROCAUC(model, classes=["Morreu", "Nao Morreu"])
visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure
# This part is dedicated to building a table and comparing the different
# algorithms graphically. (Translated from the original Portuguese note,
# which was left as a bare prose line and made the script a syntax error.)
#
# For each model we build a list aligned with the test set: the entry is a
# fixed y-level when the model predicted that sample correctly and None
# otherwise, so correct predictions can be plotted as a row of dots.
testarray = np.asarray(y_test)


def _hit_levels(predictions, truth, level):
    """Return a list with `level` where prediction == truth, else None."""
    return [level if p == t else None for p, t in zip(predictions, truth)]


# NOTE(review): the original rebound each classifier CLASS name to a
# fitted instance (e.g. ``DummyClassifier = DummyClassifier()``), which
# shadows the imported classes and breaks re-running the script. It also
# repeated the same fit/predict/compare section seven times; both issues
# are fixed by the single loop below.
_model_specs = [
    (DummyClassifier, 0.2),
    (LogisticRegression, 0.4),
    (DecisionTreeClassifier, 0.6),
    (KNeighborsClassifier, 0.8),
    (GaussianNB, 1.0),
    (RandomForestClassifier, 1.2),
    (SVC, 1.4),
]

_hit_lists = []
for _cls, _level in _model_specs:
    _clf = _cls()
    _clf.fit(X_train, y_train)
    _hit_lists.append(_hit_levels(_clf.predict(X_test), testarray, _level))

# Keep the per-model names the plotting section below expects.
(grap_DummyClassifier, grap_LogisticRegression, grap_DecisionTreeClassifier,
 grap_KNeighborsClassifier, grap_GaussianNB, grap_RandomForestClassifier,
 grap_SVC) = _hit_lists

# "Estrela" (star) marks the samples that every single model got right.
Estrela = [
    1.6 if all(levels[i] is not None for levels in _hit_lists) else None
    for i in range(len(grap_SVC))
]
# Set the font size BEFORE drawing so it applies to this figure.
# NOTE(review): the original updated rcParams as the last statement, after
# the figure was already drawn, so the setting never affected this plot.
plt.rcParams.update({'font.size': 30})
plt.figure(figsize=(20,10))
# One x position per test sample.
# NOTE(review): the original hard-coded np.arange(268) — the size of this
# particular 30% test split — which breaks for any other dataset size.
x = np.arange(len(testarray))
plt.plot(x, grap_DummyClassifier, 'bo', label='DummyClassifier')
plt.plot(x, grap_LogisticRegression, 'ro', label='LogisticRegression')
plt.plot(x, grap_DecisionTreeClassifier, 'go', label='DecisionTreeClassifier')
plt.plot(x, grap_KNeighborsClassifier, 'co', label='KNeighborsClassifier')
plt.plot(x, grap_GaussianNB, 'yo', label='GaussianNB')
plt.plot(x, grap_RandomForestClassifier, 'mo', label='RandomForestClassifier')
plt.plot(x, grap_SVC, 'ko', label='SVC')
plt.plot(x, Estrela, 'y*', label='Todos Acertaram', markersize=12)
plt.title('Comparacao entre os modelos')
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 0))
plt.show()  # display the figure when run as a plain script